
## Set up well production profiles
# Load the merged production panel; the code below expects this to provide `dt.panel`.
load(merged_panel_data_loc)

# Latest production date on record for each well (result column is V1), keyed by entity_id.
max.date.by.well <- dt.panel[, max(prod_date), keyby = entity_id]
max.date.by.well[, V1 := as.Date(V1)]

# Diagnostic: latest production date per state, earliest first.
dt.panel[, max(prod_date), by = state][order(V1)]

# The data is current through at least Dec 2018, so zeros must not be imputed past that month.
max.date <- as.Date('2018-12-01')

# Only wells whose last record is strictly before Dec 2018 are imputed. A well whose
# last record is Dec 2018 (or later) is treated as complete and left untouched.
impute.entities <- max.date.by.well[V1 < max.date, entity_id]
non.impute.entities <- max.date.by.well[V1 >= max.date, entity_id]

# Split the panel with J() lookups (these rely on dt.panel's key -- presumably
# entity_id; verify against the saved panel), then drop the combined table.
dt.panel.non.impute <- dt.panel[J(non.impute.entities)]
dt.panel.impute <- dt.panel[J(impute.entities)]
rm(dt.panel)

# Every observed row is flagged as not imputed.
dt.panel.non.impute[, imputed.zero := 0]
dt.panel.impute[, imputed.zero := 0]

# One template row per well to be imputed: the first row of each entity_id, with the
# date and production columns stripped and then re-added as zero production flagged
# as imputed.
dt.panel.impute.collapse <- unique(dt.panel.impute, by = 'entity_id')
setkey(dt.panel.impute.collapse, entity_id)
dt.panel.impute.collapse[, c('prod_date', 'liq', 'gas', 'imputed.zero') := NULL]
dt.panel.impute.collapse[, `:=`(liq = 0, gas = 0, imputed.zero = 1)]

# Build the imputed zero-production rows for a single well.
#
# Arguments:
#   id                       entity_id of the well to fill.
#   max.date.by.well         data.table looked up with J(id); its column V1 holds the
#                            well's last observed production date.
#   max.date                 last month (Date) to generate a row for, inclusive.
#   dt.panel.impute.collapse data.table looked up with J(id); supplies the template
#                            row that is repeated for every generated month.
#
# Returns:
#   cbind() of a prod_date column (monthly sequence from the month after the well's
#   last observed date through max.date) and the well's template row.
#
# Errors:
#   - if no single, non-missing start date can be determined for `id`;
#   - if the start date falls after max.date (such wells should not be imputed;
#     a start date equal to max.date is fine and yields one row).
impute.fn <- function(id, max.date.by.well, max.date, dt.panel.impute.collapse) {
  last.date <- max.date.by.well[J(id), V1]

  # %m+% rolls back to the last valid day of the month. Plain `+ months(1)` returns NA
  # when the target day does not exist (e.g. Jan 31 + 1 month), which would make the
  # comparison below fail with an uninformative error.
  start.date <- last.date %m+% months(1)

  if (length(start.date) != 1L || is.na(start.date)) {
    stop("Could not determine an imputation start date for entity ", id)
  }
  if (start.date > max.date) stop("Date mismatch. This shouldn't happen.")

  # Months that need an imputed row
  date.range <- seq.Date(from = start.date, to = max.date, by = 'months')

  # Repeat the template row once per month in the range
  cbind(prod_date = date.range,
        dt.panel.impute.collapse[J(id)])
}

# Parallelize: run impute.fn once per well on a local cluster, then stack the results.
library(parallel)
library(pbapply)
numcores <- detectCores()
# detectCores() returns NA when the core count cannot be determined; fall back to a
# single worker rather than letting makeCluster() fail.
if (is.na(numcores)) numcores <- 1L

st.time <- Sys.time()
st.time

cl <- makeCluster(numcores)

# Everything that talks to the cluster runs inside tryCatch so the workers are shut
# down even when a task raises (e.g. impute.fn's own stop()); otherwise the worker
# processes would be left running.
mc.out <- tryCatch({
  # Load necessary packages on cluster
  clusterEvalQ(cl, {library(lubridate); library(data.table)})
  # Export to cluster
  clusterExport(cl, varlist = c('max.date.by.well', 'max.date', 'dt.panel.impute.collapse'))

  pblapply(cl = cl, X = impute.entities, FUN = impute.fn,
           max.date.by.well = max.date.by.well,
           max.date = max.date,
           dt.panel.impute.collapse = dt.panel.impute.collapse)
}, finally = stopCluster(cl))

names(mc.out) <- impute.entities

# rbindlist() is data.table's direct way to stack a list of tables; it yields the same
# result as do.call(rbind, mc.out) without dispatching a call with one argument per well.
dt.panel.impute.future <- rbindlist(mc.out, use.names = TRUE)

ed.time <- Sys.time()
ed.time
difftime(ed.time, st.time)

# Merge with existing data.
# Put the imputed rows in the same column order as the observed rows, then report
# whether the two sets of column names agree.
setcolorder(dt.panel.impute.future, names(dt.panel.impute))
all(names(dt.panel.impute) == names(dt.panel.impute.future))

# The imputed rows carry Date-valued prod_date, so coerce the observed tables to Date
# before stacking.
dt.panel.impute[, prod_date := as.Date(prod_date)]
dt.panel.non.impute[, prod_date := as.Date(prod_date)]

# Observed + imputed rows for the wells that needed filling, sorted by well and month.
dt.panel.impute <- rbind(dt.panel.impute, dt.panel.impute.future)[order(entity_id, prod_date)]

# Recombine with the wells that needed no imputation and report the total row count.
dt.panel <- rbind(dt.panel.non.impute, dt.panel.impute)
nrow(dt.panel)
setkey(dt.panel, entity_id)
rm(dt.panel.non.impute, dt.panel.impute, dt.panel.impute.collapse)
Sys.time()

# Well age as returned by elapsed_months() (defined elsewhere) for each row's
# prod_date relative to first_prod_month.
dt.panel[, WellAge := elapsed_months(prod_date, first_prod_month)]

dt.panel <- dt.panel[order(entity_id, prod_date)]

# Save production profiles
# Each profile table has one row per (fed.land, prod_type, offshore, WellAge) cell with:
#   liq.daily.mean / gas.daily.mean  mean of liq / gas divided by 365.25/12
#                                    (presumably monthly volumes converted to a
#                                    per-day rate -- confirm units upstream)
#   N                                number of rows in the cell
#   share.imputed                    mean of imputed.zero, i.e. the fraction of rows
#                                    in the cell that are imputed zeros
# All wells
profiles <- dt.panel[, .(liq.daily.mean=mean(liq)/(365.25/12), gas.daily.mean=mean(gas)/(365.25/12), .N, share.imputed=mean(imputed.zero)),
                     by=.(fed.land, prod_type, offshore, WellAge)]
profiles <- profiles[order(fed.land, prod_type, offshore, WellAge)]

# Wells whose first production month is in 2009 or later
profiles.2009.plus <- dt.panel[year(first_prod_month)>=2009,
                               .(liq.daily.mean=mean(liq)/(365.25/12), gas.daily.mean=mean(gas)/(365.25/12), .N, share.imputed=mean(imputed.zero)),
                               by=.(fed.land, prod_type, offshore, WellAge)]
profiles.2009.plus <- profiles.2009.plus[order(fed.land, prod_type, offshore, WellAge)]

# Wells whose first production month is in 2014 or later
profiles.2014.plus <- dt.panel[year(first_prod_month)>=2014,
                               .(liq.daily.mean=mean(liq)/(365.25/12), gas.daily.mean=mean(gas)/(365.25/12), .N, share.imputed=mean(imputed.zero)),
                               by=.(fed.land, prod_type, offshore, WellAge)]
profiles.2014.plus <- profiles.2014.plus[order(fed.land, prod_type, offshore, WellAge)]

# Diagnostics: full tables, then the onshore oil slices by well age
profiles; profiles.2009.plus; profiles.2014.plus

profiles[prod_type=='Oil' & offshore==0][order(WellAge)]
profiles.2009.plus[prod_type=='Oil' & offshore==0][order(WellAge)]
profiles.2014.plus[prod_type=='Oil' & offshore==0][order(WellAge)]

# Build the output path once and reuse it. Evaluating today() separately for save()
# and for the recorded location could produce two different file names if the date
# rolls over between the two calls, leaving the recorded location pointing at a file
# that was never written.
production_profile_data_loc <- paste0(working.data.dir,'/production_profiles_',today(),'.RData')
save(list=c('profiles', 'profiles.2009.plus', 'profiles.2014.plus'), file=production_profile_data_loc)
profiles
profiles.2009.plus
profiles.2014.plus
# The panel is no longer needed once the profiles are saved; free the memory.
rm(dt.panel); gc()

